#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" Basic text segmenters."""

from icu import Locale, BreakIterator
from polyglot.base import Sequence


class Breaker(object):
  """ Base class to segment text.

  Subclasses are expected to assign an ICU ``BreakIterator`` to
  ``self.breaker``; the base class leaves it as ``None``, so calling
  ``transform`` on a bare ``Breaker`` fails.
  """

  def __init__(self, locale):
    """ Store the ICU locale that subclasses use to build their breaker.

    Args:
      locale: Locale identifier (e.g. ``'en'``) passed to ``icu.Locale``.
    """
    # Pass the caller's value through; quoting the name here would build
    # a Locale from the literal string 'locale' for every language.
    self.locale = Locale(locale)
    self.breaker = None

  def transform(self, sequence):
    """ Re-segment ``sequence`` with this breaker.

    Args:
      sequence: Object exposing ``.text`` and iteration over its segments
        (presumably a ``polyglot.base.Sequence`` -- confirm with callers).

    Returns:
      A new ``Sequence`` over ``sequence.text`` whose ``idx`` list starts
      at 0 and is extended with the boundaries found in every segment.
    """
    seq = Sequence(sequence.text)
    seq.idx = [0]
    for segment in sequence:
      # Boundaries produced by the breaker are relative to the segment it
      # was given; shift them by the last boundary recorded so far.
      offset = seq.idx[-1]
      self.breaker.setText(segment)
      seq.idx.extend([offset + boundary for boundary in self.breaker])
    return seq

 
class SentenceTokenizer(Breaker):
  """ Breaker whose boundaries come from an ICU sentence iterator.

  Args:
    locale: Locale identifier forwarded to ``Breaker.__init__``;
      defaults to ``'en'``.
  """

  def __init__(self, locale='en'):
    super(SentenceTokenizer, self).__init__(locale)
    # The base initializer has set self.locale; build the iterator from it.
    sentence_iterator = BreakIterator.createSentenceInstance(self.locale)
    self.breaker = sentence_iterator


class WordTokenizer(Breaker):
  """ Breaker whose boundaries come from an ICU word iterator.

  Args:
    locale: Locale identifier forwarded to ``Breaker.__init__``;
      defaults to ``'en'``.
  """

  def __init__(self, locale='en'):
    super(WordTokenizer, self).__init__(locale)
    # The base initializer has set self.locale; build the iterator from it.
    word_iterator = BreakIterator.createWordInstance(self.locale)
    self.breaker = word_iterator